package com.chance.cc.crawler.development.bootstrap.wangyi.health;

import com.alibaba.fastjson.JSON;
import com.chance.cc.crawler.core.CrawlerEnum;
import com.chance.cc.crawler.core.downloader.HttpConfig;
import com.chance.cc.crawler.core.downloader.proxy.Proxy;
import com.chance.cc.crawler.core.filter.FilterUtils;
import com.chance.cc.crawler.core.record.CrawlerRecord;
import com.chance.cc.crawler.core.record.CrawlerRequestRecord;
import com.chance.cc.crawler.development.controller.DevCrawlerController;
import org.apache.commons.lang3.StringUtils;

import static com.chance.cc.crawler.core.CrawlerEnum.CrawlerRequestType.filter;

/**
 * @author lt
 * @version 1.0
 * @date 2021-02-08 10:55:49
 * @email okprog@sina.com
 */
/**
 * Dev bootstrap entry point for the NetEase (163.com) health/sports crawlers.
 * Builds seed {@link CrawlerRequestRecord}s for list pages and a single article,
 * then starts an in-memory {@link DevCrawlerController} for local testing.
 *
 * @author lt
 * @version 1.0
 * @date 2021-02-08 10:55:49
 * @email okprog@sina.com
 */
public class WYHealthStart {
    private static final String domain = "wangyi";

    private static Proxy proxy = new Proxy();
    static {
        // Proxy configuration (abuyun dynamic HTTP proxy).
        // SECURITY(review): credentials are hard-coded in source; move them to
        // configuration/environment before sharing or committing this file.
        proxy.setHost("http-dyn.abuyun.com");
        proxy.setPort(9020);
        proxy.setUsername("HL89Q19E86E2987D");
        proxy.setPassword("71F33D94CE5F7BF2");
    }

    public static void main(String[] args) {
        // Enable the listed TLS/SSL protocol versions for HTTPS access.
        // NOTE(review): SSLv3 is obsolete and vulnerable (POODLE); drop it unless
        // the target site actually requires it — confirm before removing.
        System.setProperty("https.protocols", "TLSv1,TLSv1.1,TLSv1.2,SSLv3");

        // Article-collection seed records. Only the sports list crawler is wired
        // into the controller below; the other two are dev toggles kept so they
        // can be swapped in quickly (building them may also pre-register
        // in-memory filter keys via FilterUtils).
        CrawlerRequestRecord articleCrawler = doSearchNewsCrawler();
        CrawlerRequestRecord sportsArticleCrawler = doSportsNewsCrawler();
        CrawlerRequestRecord articleRecord = doArticleRecord();

        DevCrawlerController devCrawlerController = DevCrawlerController.builder()
                .triggerInfo(domain, domain + "_trigger", System.currentTimeMillis(), domain + "_job")
                .crawlerRequestQueue(DevCrawlerController.devRequestQueue(domain)) // in-memory queue
                .consoleResultPipeline() // print results to the console
//                .fileResultPipeline("D:\\chance\\data\\wangyi\\wangyi_article_test.json",false)
                .requestRecord(sportsArticleCrawler)  // more job
//                .crawlerThreadNum(5)
                .build();
        devCrawlerController.getCrawlerJob().getScheduleTags().getCategoryTag().addLabelTag(CrawlerEnum.CrawlerDataType.comment.enumVal());
        devCrawlerController.start();
    }

    /**
     * Builds a seed record for a single sports article page ("turn page item").
     * Tags the result for article + interaction data and attaches the comment
     * sub-request filter.
     *
     * @return the configured single-article request record
     */
    private static CrawlerRequestRecord doArticleRecord() {
        String site = "sport";
        String url = "https://www.163.com/sports/article/GCVK5AK000058780.html";

        CrawlerRequestRecord requestRecord = CrawlerRequestRecord.builder()
                .startPageRequest(domain, CrawlerEnum.CrawlerRequestType.turnPageItem)
                .domain(domain)
                .recordKey(url)
                .httpUrl(url)
                .releaseTime(System.currentTimeMillis())
                .httpConfig(HttpConfig.me(domain))
                // NOTE(review): filter is `key` but a dateRange filter-info is also
                // attached below; the list crawlers use `keyOrDateRange` — confirm
                // whether that was intended here too.
                .filter(CrawlerEnum.CrawlerRecordFilter.key)
                .addFilterInfo(FilterUtils.memoryFilterKeyInfo(domain))
                .addFilterInfo(FilterUtils.dateRangeFilterInfo(24 * 7, null)) // keep articles from the last 7 days
                .proxy(proxy)
                .resultLabelTag(CrawlerEnum.CrawlerDataType.article)
                .resultLabelTag(CrawlerEnum.CrawlerDataType.interaction)
                .build();

        requestRecord.tagsCreator().bizTags().addDomain(domain);
        requestRecord.tagsCreator().bizTags().addSite(site);
        requestRecord.tagsCreator().scheduleTags().getCategoryTag().addLabelTag(CrawlerEnum.CrawlerDataType.comment.enumVal());

        attachCommentFilter(requestRecord, 24);
        requestRecord.getHttpRequest().addExtra("listUrl", "1111");
        return requestRecord;
    }

    /**
     * Builds the seed record for the NetEase health channel list page.
     *
     * @return the configured list-page ("turn page") request record
     */
    public static CrawlerRequestRecord doSearchNewsCrawler() {
        CrawlerRequestRecord requestRecord = buildListPageRecord("health", "https://jiankang.163.com/");
        requestRecord.getHttpRequest().addExtra("listUrl", "1111");
        return requestRecord;
    }

    /**
     * Builds the seed record for the NetEase sports list page, including the
     * comment sub-request filter tag.
     *
     * @return the configured list-page ("turn page") request record
     */
    public static CrawlerRequestRecord doSportsNewsCrawler() {
        CrawlerRequestRecord requestRecord = buildListPageRecord("sports", "http://sports.163.com/special/0005rt/sportsgd.html");
        attachCommentFilter(requestRecord, 24);
        return requestRecord;
    }

    /**
     * Builds a list-page ("turn page") crawler record with the settings shared by
     * {@link #doSearchNewsCrawler()} and {@link #doSportsNewsCrawler()}: 7-day
     * key-or-date-range filtering, proxy, parse-but-don't-wash, and domain/site biz tags.
     *
     * @param site biz-tag site label (e.g. "health", "sports")
     * @param url  list-page URL, used as both record key and HTTP URL
     * @return the configured request record
     */
    private static CrawlerRequestRecord buildListPageRecord(String site, String url) {
        CrawlerRequestRecord requestRecord = CrawlerRequestRecord.builder()
                .startPageRequest(domain, CrawlerEnum.CrawlerRequestType.turnPage)
                .domain(domain)
                .recordKey(url)
                .httpUrl(url)
                .releaseTime(System.currentTimeMillis())
                .httpConfig(HttpConfig.me(domain))
                .filter(CrawlerEnum.CrawlerRecordFilter.keyOrDateRange)
                .addFilterInfo(FilterUtils.memoryFilterKeyInfo(domain))
                .addFilterInfo(FilterUtils.dateRangeFilterInfo(24 * 7, null)) // keep articles from the last 7 days
                .proxy(proxy)
                .needParsed(true)
                .needWashed(false)
                .build();

        requestRecord.tagsCreator().bizTags().addDomain(domain);
        requestRecord.tagsCreator().bizTags().addSite(site);
        return requestRecord;
    }

    /**
     * Serializes a date-range filter for the comment sub-requests of the given
     * record and stores it as the "comment_record_filter_info" biz tag (this
     * exact duplicated block previously appeared in two methods).
     *
     * @param requestRecord record to attach the tag to
     * @param hours         date-range window in hours for comment filtering
     */
    private static void attachCommentFilter(CrawlerRequestRecord requestRecord, int hours) {
        CrawlerRecord filterCrawlerRecord = new CrawlerRecord();
        filterCrawlerRecord.setFilter(CrawlerEnum.CrawlerRecordFilter.dateRange);
        filterCrawlerRecord.addFilterInfo(FilterUtils.memoryFilterKeyInfo(StringUtils.joinWith("-", filter, domain, "comment")));
        filterCrawlerRecord.addFilterInfo(FilterUtils.dateRangeFilterInfo(hours, null));
        requestRecord.tagsCreator().bizTags().addCustomKV("comment_record_filter_info", JSON.toJSONString(filterCrawlerRecord));
    }

}
